### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### This file creates the CoreLogic analysis data


library(data.table)
library(ggplot2)
library(zoo)
library(Hmisc)
library(stringr)

### Toggle to T to use sharable sample data ###
SHARABLE = T 

raw.data.path = '../data/raw/corelogic/'
processed.data.path = '../data/processed/corelogic'
sharable.processed.data.path = '../data/processed/corelogic/share/'
sharable.raw.data.path = '../data/raw/corelogic/share/'

create_sharable_raw_data <- function() {
  # Creates a shareable permuted sample for public-use replication package.
  
  # Columns to save
  fields = c('pclidirisfrmtd','fipscode','transactiontype','propertytype','corporateindicator','owner1lastname','owner1firstnamemi','owneretalindicator','resalenewconstruction',
             'absenteeownerstatus','situscity','situszipcode','parcellevellatitude','parcellevellongitude','sellerlastname','sellername1','saleamount','saledate',
             'lendercompanycode','mortgageamount','mortgageloantypecode','lenderlastname','foreclosure','equityflag','refiflag',
             'censustract','landsquarefootage','universalbuilding_sqft','living_sqft','yearbuilt','airconditioning',
             'garage','heating','quality','roofcover','storiesnum','locationinfluence','electric','sewer','water')
  
  # Raw data files from Corelogic
  files <- c(
    'MaricopaOpendoorBase1980_1989.csv',
    'MaricopaOpendoorBase1990_1999.csv',
    'MaricopaOpendoorBase2000_2009.csv',
    'MaricopaOpendoorBase2010_2019.csv',
    '32003OpendoorBase1990_1999.csv',
    '32003OpendoorBase2000_2009.csv',
    '32003OpendoorBase2010_2019.csv',
    '12095OpendoorBase.csv',
    '48113OpendoorBase.csv',
    '13135OpendoorBase.csv'
  )
  
  # Loop through data, sample and randomize, and write
  for(ff in files) {
    print(paste0('Creating sharable data from ',ff))
    data.in <- fread(paste0(raw.data.path,ff))[,fields,with=F]
    sampled.data <- sample_and_scramble(data.in,frac = 0.20,seed = 12301986)
    write.table(file=paste0(sharable.raw.data.path,ff),x=sampled.data,row.names=F,col.names=T)
  }  
  

  
  
}

prepData <- function(Market = 'Phoenix') {
  
  print(paste('Prepping data for',Market))
  
  fields = c('pclidirisfrmtd','fipscode','transactiontype','propertytype','corporateindicator','owner1lastname','owner1firstnamemi','owneretalindicator','resalenewconstruction',
             'absenteeownerstatus','situscity','situszipcode','parcellevellatitude','parcellevellongitude','sellerlastname','sellername1','saleamount','saledate',
             'lendercompanycode','mortgageamount','mortgageloantypecode','lenderlastname','foreclosure','equityflag','refiflag',
             'censustract','landsquarefootage','universalbuilding_sqft','living_sqft','yearbuilt','airconditioning',
             'garage','heating','quality','roofcover','storiesnum','locationinfluence','electric','sewer','water')
  
  # iBuyers
  iBuyers = paste(c('opendoor','open door','\\<od [a-z].*',
                    'offerpad','offer pad',
                    'redfin','red fin',
                    'zillow',
                    'atlas',
                    'knock'),collapse = '|')
  
  # From Buchak et al (2018)
  fintechs = paste(c('quicken','amerisave','guaranteed','ntfn','internet',
                     '21st','first residential','movement','ne moves',
                     'nutter and company','cashcall','summit','cardinal','homeward'),collapse = '|')
  
  
  if(SHARABLE) {
    path = sharable.raw.data.path
  } else {
    path = raw.data.path
  }
  
  if(Market == 'Phoenix') {
    d.1 <- fread(paste0(path,'MaricopaOpendoorBase1980_1989.csv'))[,fields,with=F]
    d.2 <- fread(paste0(path,'MaricopaOpendoorBase1990_1999.csv'))[,fields,with=F]
    d.3 <- fread(paste0(path,'MaricopaOpendoorBase2000_2009.csv'))[,fields,with=F]
    d.4 <- fread(paste0(path,'MaricopaOpendoorBase2010_2019.csv'))[,fields,with=F]
    data <- rbind(d.1,d.2,d.3,d.4)
    rm(list=c('d.1','d.2','d.3','d.4'))
  } else if(Market == 'Las Vegas') {
    d.2 <- fread(paste0(path,'32003OpendoorBase1990_1999.csv'))[,fields,with=F]
    d.3 <- fread(paste0(path,'32003OpendoorBase2000_2009.csv'))[,fields,with=F]
    d.4 <- fread(paste0(path,'32003OpendoorBase2010_2019.csv'))[,fields,with=F]
    data <- rbind(d.2,d.3,d.4)
    rm(list=c('d.2','d.3','d.4'))
  } else if(Market == 'Orlando'){
    data <- fread(paste0(path,'12095OpendoorBase.csv'))[,fields,with=F]
  } else if(Market == 'Dallas') {
    data <- fread(paste0(path,'48113OpendoorBase.csv'))[,fields,with=F]
  } else if(Market == 'Gwinnett') {
    data <- fread(paste0(path,'13135OpendoorBase.csv'))[,fields,with=F]
  }
  
  ## Record the market
  data[,Market := Market]
  
  ## Filter out non-single family homes. If missing, impute from recordings about the same property
  data[,propertytype := as.numeric(propertytype)]
  data[,single.family := max(as.numeric(propertytype==10),na.rm=T),by='pclidirisfrmtd']
  data[,condo         := max(as.numeric(propertytype==11),na.rm=T),by='pclidirisfrmtd']
  data <- data[single.family == 1 | condo == 1]

  ## Fix dates---only use the sale date. If no sale date it's not a sale...
  data[,date := saledate]
  data[date %% 100 == 0,date := NA]
  data[,date := as.Date(as.character(date),format = '%Y%m%d')]
  data$saledate <- NULL
  data[,year := year(date)]
  data <- data[!is.na(date)]
  
  ## Create Buyer / Seller Name
  data[,buyername := paste(owner1firstnamemi,owner1lastname)]
  data[,sellername := sellerlastname]
  data[sellername == '',sellername := sellername1]
  data[buyername == ' ',buyername := 'unknown']
  data[sellername == '',sellername := 'unknown']
  
  ## Only take arms' length transactions of existing homes or sales of new constructions.
  ## Primariliy filters out nominal transfers & mortgage refinances 
  data <- data[(transactiontype == 1 & resalenewconstruction == 'M') | (transactiontype == 3 & resalenewconstruction == 'N')]
  
  ## Create simple-id and remove duplicated transactions
  data[,id := paste(Market,pclidirisfrmtd)]
  data <- data[!duplicated(paste(date,id,buyername,sellername))]
  
  ## Identify iBuyers and fintech lenders
  data[,iBuyer.buyer  := as.integer(grepl(buyername,pattern=iBuyers,ignore.case=T))]
  data[,iBuyer.seller := as.integer(grepl(sellername,pattern=iBuyers,ignore.case=T))]
  data[,fintech.lender := as.integer(grepl(lenderlastname,pattern=fintechs,ignore.case=T))]
  
  ## Get lagged buyers / sellers
  data <- data[order(id,date)]
  data[,ltv := mortgageamount / saleamount]
  data[,seller.buy.date       := shift(date,type = 'lag'),by='id']
  data[,seller.buy.amount     := shift(saleamount,type= 'lag' ),by='id']
  data[,seller.fintech.lender := shift(fintech.lender,type= 'lag' ),by='id']
  data[,seller.ltv            := shift(ltv,type= 'lag' ),by='id']
  data[,seller.foreclosed     := shift(foreclosure,type= 'lag' ),by='id']
  data[,buyer.foreclosed      := shift(foreclosure,type= 'lead' ),by='id']
  
  # Fix names----going to take selerlastname and owner1lastname as defaults, and if they're missing, take the LAST STRING of buyer and seller name, 
  # unless it's is a corporate name like 'LLC,' or a common, similar word.
  
  # Corp names
  corpNames = c('CORP|LLC|INC|LP|SALES|CONSTRUCTION|TRUST|INC|unknown|HOMES|FNAM|DEV|JR|HUD|LTD|C|INS|MAE|ASSN|MARKETING|RECONVEYANCE|BK|FSB|CORPORATION|PROPERTIES|ADMN|ASSOCIATES|BANK|AMERICA|CORPS|DEVELOPMENT|AFFAIRS|INVESTMENTS|COUNTY|CORPORAT|HMS')
  
  data[,seller_lastname_clean := gsub(sellerlastname,pattern=corpNames,replace='')]
  data[,seller_lastname_clean := trimws(seller_lastname_clean)]
  data[seller_lastname_clean == '', seller_lastname_clean := trimws(word(trimws(gsub(sellername,pattern=corpNames,replace='')),-1))]
  data[seller_lastname_clean == '', seller_lastname_clean := NA]
  
  data[,buyer_lastname_clean := gsub(owner1lastname,pattern=corpNames,replace='')]
  data[,buyer_lastname_clean := trimws(buyer_lastname_clean)]
  data[buyer_lastname_clean == '', buyer_lastname_clean := trimws(word(trimws(gsub(buyername,pattern=corpNames,replace='')),-1))]
  data[buyer_lastname_clean == '', buyer_lastname_clean := NA]
  
  
  
  if(SHARABLE) {
    write.table(data,file=paste0(sharable.processed.data.path,'/',gsub(Market,pattern=' ',replacement = '_'),'_processed.csv'),row.names=F,col.names=T)
  } else {
    write.table(data,file=paste0(processed.data.path,'/',gsub(Market,pattern=' ',replacement = '_'),'_processed.csv'),row.names=F,col.names=T)
  }
}


processMarkets <- function() {
  markets = c('Phoenix','Orlando','Dallas','Las Vegas','Gwinnett')
  for(mm in markets) {
    prepData(mm)
  }
  combineData()
}

combineData <- function() {
  
  if(SHARABLE) {
    phoenix.in   <- fread(paste0(sharable.processed.data.path,'/Phoenix_processed.csv'))
    orlando.in   <- fread(paste0(sharable.processed.data.path,'/Orlando_processed.csv'))
    dallas.in    <- fread(paste0(sharable.processed.data.path,'/Dallas_processed.csv'))
    las.vegas.in    <- fread(paste0(sharable.processed.data.path,'/Las_Vegas_processed.csv'))
    gwinnett.in    <- fread(paste0(sharable.processed.data.path,'/Gwinnett_processed.csv'))
    data.in <- rbind(phoenix.in,orlando.in,dallas.in,las.vegas.in,gwinnett.in) 
    write.table(data.in,paste0(sharable.processed.data.path,'/Combined_processed.csv'),row.names=F,col.names=T)
  } else {
    phoenix.in   <- fread(paste0(processed.data.path,'/Phoenix_processed.csv'))
    orlando.in   <- fread(paste0(processed.data.path,'/Orlando_processed.csv'))
    dallas.in    <- fread(paste0(processed.data.path,'/Dallas_processed.csv'))
    las.vegas.in    <- fread(paste0(processed.data.path,'/Las_Vegas_processed.csv'))
    gwinnett.in    <- fread(paste0(processed.data.path,'/Gwinnett_processed.csv'))
    data.in <- rbind(phoenix.in,orlando.in,dallas.in,las.vegas.in,gwinnett.in) 
    write.table(data.in,paste0(processed.data.path,'/Combined_processed.csv'),row.names=F,col.names=T)
  }
}

sample_and_scramble <- function(DT, frac = 0.1, seed = 0) {
  stopifnot(is.data.table(DT))
  if (!is.null(seed)) set.seed(seed)
  
  # Sampling is done by pclidirisfrmtd (lots of work is dont with property fixed effects, or properties over time)
  all.ids <- unique(DT$pclidirisfrmtd)
  
  # 1) sample ~1% of rows (at least 1 row)
  n.ids <- length(all.ids)
  sampled.ids <- sample(all.ids,size = floor(n.ids * frac),replace = F)
  
  out <- DT[pclidirisfrmtd %in% sampled.ids]
  n = nrow(out)
  
  # drop keys to avoid surprises
  setkey(out, NULL)
  
  # 2) independently permute each column (leaving fips and id intact)
  for (j in 3:ncol(out)) {
    set(out, j = j, value = out[[j]][sample.int(n)])
  }
  
  out
}


#create_sharable_raw_data() Creates sharable (randomized) data from raw corelogic data (not accessible publicly)
processMarkets()



